In [6]:
import pandas as pd
import numpy as np
import seaborn as sns
import multiprocessing
from multiprocessing import Pool
# Number of worker processes used by parallelize_dataframe below.
# NOTE(review): hardcoded — consider multiprocessing.cpu_count() (queried in the next cell).
num_cores = 8
# NOTE(review): sns.load_dataset already returns a DataFrame, so the
# pd.DataFrame(...) wrap is redundant (harmless copy).
iris = pd.DataFrame(sns.load_dataset('iris'))
In [7]:
# cpu count
# Display the number of logical CPUs on this machine (for choosing num_cores).
multiprocessing.cpu_count()
Out[7]:
In [8]:
def parallelize_dataframe(df, func, n_cores=None):
    """Split a DataFrame row-wise and apply ``func`` to the chunks in parallel.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; split into ``n_cores`` roughly equal row chunks.
    func : callable
        Applied to each chunk; must accept and return a DataFrame and be
        picklable (i.e. a module-level function), since it is sent to
        worker processes.
    n_cores : int, optional
        Number of worker processes. Defaults to the notebook-level
        ``num_cores`` setting, preserving the original behaviour.

    Returns
    -------
    pandas.DataFrame
        Concatenation of the per-chunk results (original index preserved).
    """
    if n_cores is None:
        n_cores = num_cores  # fall back to the global configured above
    chunks = np.array_split(df, n_cores)
    # Context manager guarantees the pool is torn down even if func raises
    # in a worker; the original close()/join() pair leaked processes on error.
    with Pool(n_cores) as pool:
        result = pd.concat(pool.map(func, chunks))
    return result
In [9]:
def multiply_columns(data):
    """Add a ``length_of_word`` column: the character length of ``species``.

    NOTE: mutates ``data`` in place AND returns it, matching the original
    notebook behaviour (later cells re-run this on the shared ``iris`` frame).

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a string column ``species``.

    Returns
    -------
    pandas.DataFrame
        The same frame object, with ``length_of_word`` (int64) added.
    """
    # Vectorized .str.len() replaces the per-row apply(lambda x: len(x));
    # same int64 result, no Python-level loop.
    data['length_of_word'] = data['species'].str.len()
    return data
In [10]:
# Baseline: single-process timing. NOTE(review): this mutates `iris` in place,
# so `normal_iris` is the same object as `iris`, not a copy.
%time normal_iris = multiply_columns(iris)
In [11]:
# Parallel timing. For a frame this small, process spawn/pickle overhead will
# likely dominate — expect this to be SLOWER than the single-process cell above.
%time parallelize_iris = parallelize_dataframe(iris, multiply_columns)
In [ ]: